% PPTAgent preprint. The braced system name keeps its capitals under sentence-casing styles.
@article{zheng2025pptagent,
  author  = {Zheng, Hao and Guan, Xinyan and Kong, Hao and Zheng, Jia and Zhou, Weixiang and Lin, Hongyu and Lu, Yaojie and He, Ben and Han, Xianpei and Sun, Le},
  title   = {{PPTAgent}: Generating and evaluating presentations beyond text-to-slides},
  journal = {arXiv preprint arXiv:2501.03936},
  year    = {2025}
}

% Paper2Poster preprint. The braced system name survives style down-casing of the title.
@article{pang2025paper2poster,
  author  = {Pang, Wei and Lin, Kevin Qinghong and Jian, Xiangru and He, Xi and Torr, Philip},
  title   = {{Paper2Poster}: Towards Multimodal Poster Automation from Scientific Papers},
  journal = {arXiv preprint arXiv:2505.21497},
  year    = {2025}
}

% Veo 3 technical report. The report label lives in the type field; repeating it
% in a note would make standard techreport styles print the label twice.
@techreport{deepmind2025veo3,
  author      = {{DeepMind}},
  title       = {Veo 3 Technical Report},
  type        = {Technical Report},
  institution = {DeepMind},
  year        = {2025},
  month       = may,
  url         = {https://storage.googleapis.com/deepmind-media/veo/Veo-3-Tech-Report.pdf}
}

% D2S preprint. The acronym is braced against down-casing; middle initials are
% written separately so styles can parse and abbreviate them.
@article{sun2021d2s,
  author  = {Sun, Edward and Hou, Yufang and Wang, Dakuo and Zhang, Yunfeng and Wang, Nancy X. R.},
  title   = {{D2S}: Document-to-slide generation via query-based text summarization},
  journal = {arXiv preprint arXiv:2105.03664},
  year    = {2021}
}

% Survey of controllable video generation (arXiv preprint, 2025).
@article{ma2025controllable,
  author  = {Ma, Yue and Feng, Kunyu and Hu, Zhongyuan and Wang, Xinyu and Wang, Yucheng and Zheng, Mingzhe and He, Xuanhua and Zhu, Chenyang and Liu, Hongyu and He, Yingqing and others},
  title   = {Controllable video generation: A survey},
  journal = {arXiv preprint arXiv:2507.16869},
  year    = {2025}
}

% ShowUI (CVPR 2025). The system name and the GUI acronym are braced against down-casing.
@inproceedings{lin2025showui,
  author    = {Lin, Kevin Qinghong and Li, Linjie and Gao, Difei and Yang, Zhengyuan and Wu, Shiwei and Bai, Zechen and Lei, Stan Weixian and Wang, Lijuan and Shou, Mike Zheng},
  title     = {{ShowUI}: One vision-language-action model for {GUI} visual agent},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference},
  pages     = {19498--19508},
  year      = {2025}
}

% UI-TARS preprint. The system name and the GUI acronym are braced against down-casing.
@article{qin2025ui,
  author  = {Qin, Yujia and Ye, Yining and Fang, Junjie and Wang, Haoming and Liang, Shihao and Tian, Shizuo and Zhang, Junda and Li, Jiahao and Li, Yunxin and Huang, Shijue and others},
  title   = {{UI-TARS}: Pioneering automated {GUI} interaction with native agents},
  journal = {arXiv preprint arXiv:2501.12326},
  year    = {2025}
}

% WhisperX preprint. The braced system name keeps its capitals under sentence-casing styles.
@article{bain2023whisperx,
  author  = {Bain, Max and Huh, Jaesung and Han, Tengda and Zisserman, Andrew},
  title   = {{WhisperX}: Time-accurate speech transcription of long-form audio},
  journal = {arXiv preprint arXiv:2303.00747},
  year    = {2023}
}

% F5-TTS preprint. The same work is also listed under key tts-f5; keep both entries in sync.
% The braced system name keeps its capitals under sentence-casing styles.
@article{chen2024f5,
  author  = {Chen, Yushen and Niu, Zhikang and Ma, Ziyang and Deng, Keqi and Wang, Chunhui and Zhao, Jian and Yu, Kai and Chen, Xie},
  title   = {{F5-TTS}: A fairytaler that fakes fluent and faithful speech with flow matching},
  journal = {arXiv preprint arXiv:2410.06885},
  year    = {2024}
}

% Hallo2: audio-driven portrait image animation (arXiv preprint, 2024).
@article{cui2024hallo2,
  author  = {Cui, Jiahao and Li, Hui and Yao, Yao and Zhu, Hao and Shang, Hanlin and Cheng, Kaihui and Zhou, Hang and Zhu, Siyu and Wang, Jingdong},
  title   = {Hallo2: Long-duration and high-resolution audio-driven portrait image animation},
  journal = {arXiv preprint arXiv:2410.07718},
  year    = {2024}
}

% Latent diffusion models (CVPR 2022). The proceedings title is capitalised to
% match the other IEEE/CVF CVPR entries in this file.
@inproceedings{rombach2022high,
  author    = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj{\"o}rn},
  title     = {High-resolution image synthesis with latent diffusion models},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {10684--10695},
  year      = {2022}
}

% Multi-agent movie generation preprint. The CoT acronym is braced against down-casing.
@article{wu2025automated,
  author  = {Wu, Weijia and Zhu, Zeyu and Shou, Mike Zheng},
  title   = {Automated movie generation via multi-agent {CoT} planning},
  journal = {arXiv preprint arXiv:2503.07314},
  year    = {2025}
}

% PresentAgent preprint. The braced system name keeps its capitals under sentence-casing styles.
@article{shi2025presentagent,
  author  = {Shi, Jingwei and Zhang, Zeyu and Wu, Biao and Liang, Yanjie and Fang, Meng and Chen, Ling and Zhao, Yang},
  title   = {{PresentAgent}: Multimodal agent for presentation video generation},
  journal = {arXiv preprint arXiv:2507.04036},
  year    = {2025}
}

% AI4Research survey preprint. The braced name survives style down-casing of the title.
@article{ai4research,
  author  = {Chen, Qiguang and Yang, Mingda and Qin, Libo and Liu, Jinhao and Yan, Zheng and Guan, Jiannan and Peng, Dengyun and Ji, Yiyan and Li, Hanjing and Hu, Mengkang and others},
  title   = {{AI4Research}: A Survey of Artificial Intelligence for Scientific Research},
  journal = {arXiv preprint arXiv:2507.01903},
  year    = {2025}
}

% Focused feedback generation for scientific writing (Findings of ACL 2024).
@inproceedings{writing_ass,
  title     = {Automated Focused Feedback Generation for Scientific Writing Assistance},
  author    = {Chamoun, Eric and Schlichtkrull, Michael and Vlachos, Andreas},
  editor    = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2024},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-acl.580/},
  doi       = {10.18653/v1/2024.findings-acl.580},
  pages     = {9742--9763},
  abstract  = {Scientific writing is a challenging task, particularly for novice researchers who often rely on feedback from experienced peers. Recent work has primarily focused on improving surface form and style rather than manuscript content. In this paper, we propose a novel task: automated focused feedback generation for scientific writing assistance. We present SWIF$^2$T: a Scientific WrIting Focused Feedback Tool. It is designed to generate specific, actionable and coherent comments, which identify weaknesses in a scientific paper and/or propose revisions to it. Our approach consists of four components - planner, investigator, reviewer and controller - leveraging multiple Large Language Models (LLMs) to implement them. We compile a dataset of 300 peer reviews citing weaknesses in scientific papers and conduct human evaluation. The results demonstrate the superiority in specificity, reading comprehension, and overall helpfulness of SWIF$^2$T{'}s feedback compared to other approaches. In our analysis, we also identified cases where automatically generated reviews were judged better than human ones, suggesting opportunities for integration of AI-generated feedback in scientific writing.}
}

% Knowledge Navigator preprint. The system name and the LLM acronym are braced against down-casing.
@article{sci_lit,
  author  = {Katz, Uri and Levy, Mosh and Goldberg, Yoav},
  title   = {{Knowledge Navigator}: {LLM}-guided browsing framework for exploratory search in scientific literature},
  journal = {arXiv preprint arXiv:2408.15836},
  year    = {2024}
}

% NovaChart dataset (ACM Multimedia 2024). The braced dataset name keeps its capitals.
@inproceedings{hu2024novachart,
  author    = {Hu, Linmei and Wang, Duokang and Pan, Yiming and Yu, Jifan and Shao, Yingxia and Feng, Chong and Nie, Liqiang},
  title     = {{NovaChart}: A large-scale dataset towards chart understanding and generation of multimodal large language models},
  booktitle = {Proceedings of the 32nd ACM International Conference on Multimedia},
  pages     = {3917--3925},
  year      = {2024}
}

% FantasyTalking preprint. The braced system name keeps its capitals under sentence-casing styles.
@article{fantasytalking,
  author  = {Wang, Mengchao and Wang, Qiang and Jiang, Fan and Fan, Yaqi and Zhang, Yunpeng and Qi, Yonggang and Zhao, Kun and Xu, Mu},
  title   = {{FantasyTalking}: Realistic talking portrait generation via coherent motion synthesis},
  journal = {arXiv preprint arXiv:2504.04842},
  year    = {2025}
}

% F5-TTS preprint. The same work is also listed under key chen2024f5; keep both entries in sync.
% The braced system name keeps its capitals under sentence-casing styles.
@article{tts-f5,
  author  = {Chen, Yushen and Niu, Zhikang and Ma, Ziyang and Deng, Keqi and Wang, Chunhui and Zhao, Jian and Yu, Kai and Chen, Xie},
  title   = {{F5-TTS}: A fairytaler that fakes fluent and faithful speech with flow matching},
  journal = {arXiv preprint arXiv:2410.06885},
  year    = {2024}
}

% SpeechBrain 1.0 (JMLR 2024). Names use the Last, First form so multi-word
% surnames (Della Libera, De Mori) are parsed as one surname. Accents use a
% single brace group so BibTeX treats each as one special character.
@article{audio_embedding,
  author  = {Ravanelli, Mirco and
             Parcollet, Titouan and
             Moumen, Adel and
             de Langen, Sylvain and
             Subakan, Cem and
             Plantinga, Peter and
             Wang, Yingzhi and
             Mousavi, Pooneh and
             Della Libera, Luca and
             Ploujnikov, Artem and
             Paissan, Francesco and
             Borra, Davide and
             Zaiem, Salah and
             Zhao, Zeyu and
             Zhang, Shucong and
             Karakasidis, Georgios and
             Yeh, Sung-Lin and
             Champion, Pierre and
             Rouhe, Aku and
             Braun, Rudolf and
             Mai, Florian and
             Zuluaga-Gomez, Juan and
             Mousavi, Seyed Mahed and
             Nautsch, Andreas and
             Nguyen, Ha and
             Liu, Xuechen and
             Sagar, Sangeet and
             Duret, Jarod and
             Mdhaffar, Salima and
             Laperri{\`e}re, Ga{\"e}lle and
             Rouvier, Mickael and
             De Mori, Renato and
             Est{\`e}ve, Yannick},
  title   = {Open-Source Conversational {AI} with {SpeechBrain} 1.0},
  journal = {Journal of Machine Learning Research},
  year    = {2024},
  volume  = {25},
  number  = {333},
  url     = {http://jmlr.org/papers/v25/24-0991.html}
}

% Paper2Code preprint. The braced system name keeps its capitals under sentence-casing styles.
@article{paper2code,
  author  = {Seo, Minju and Baek, Jinheon and Lee, Seongyun and Hwang, Sung Ju},
  title   = {{Paper2Code}: Automating code generation from scientific papers in machine learning},
  journal = {arXiv preprint arXiv:2504.17192},
  year    = {2025}
}

% MovieBench dataset (CVPR 2025). The braced dataset name keeps its capitals.
@inproceedings{wu2025moviebench,
  author    = {Wu, Weijia and Liu, Mingyu and Zhu, Zeyu and Xia, Xi and Feng, Haoen and Wang, Wen and Lin, Kevin Qinghong and Shen, Chunhua and Shou, Mike Zheng},
  title     = {{MovieBench}: A hierarchical movie level dataset for long video generation},
  booktitle = {Proceedings of the Computer Vision and Pattern Recognition Conference},
  pages     = {28984--28994},
  year      = {2025}
}

% VBench benchmark (CVPR 2024). The braced benchmark name keeps its capitals.
@inproceedings{vbench,
  author    = {Huang, Ziqi and He, Yinan and Yu, Jiashuo and Zhang, Fan and Si, Chenyang and Jiang, Yuming and Zhang, Yuanhan and Wu, Tianxing and Jin, Qingyang and Chanpaisit, Nattapol and others},
  title     = {{VBench}: Comprehensive benchmark suite for video generative models},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {21807--21818},
  year      = {2024}
}

% Wan video generation models preprint. The team author is braced as a single
% corporate name so styles neither invert nor abbreviate it like a person.
@article{wan,
  author  = {{Team Wan} and Wang, Ang and Ai, Baole and Wen, Bin and Mao, Chaojie and Xie, Chen-Wei and Chen, Di and Yu, Feiwu and Zhao, Haiming and Yang, Jianxiao and others},
  title   = {Wan: Open and advanced large-scale video generative models},
  journal = {arXiv preprint arXiv:2503.20314},
  year    = {2025}
}

% CAMEL (NeurIPS 2023). LaTeX quote ligatures replace the straight double quotes,
% which would otherwise typeset as two closing quotes. The acronym is braced.
@inproceedings{camel,
  author    = {Li, Guohao and Hammoud, Hasan Abed Al Kader and Itani, Hani and Khizbullin, Dmitrii and Ghanem, Bernard},
  title     = {{CAMEL}: Communicative Agents for {``Mind''} Exploration of Large Language Model Society},
  booktitle = {Thirty-seventh Conference on Neural Information Processing Systems},
  year      = {2023}
}
% PaperBench preprint. The benchmark name and the AI acronym are braced against down-casing.
@article{paperbench,
  author  = {Starace, Giulio and Jaffe, Oliver and Sherburn, Dane and Aung, James and Chan, Jun Shern and Maksin, Leon and Dias, Rachel and Mays, Evan and Kinsella, Benjamin and Thompson, Wyatt and others},
  title   = {{PaperBench}: Evaluating {AI}'s Ability to Replicate {AI} Research},
  journal = {arXiv preprint arXiv:2504.01848},
  year    = {2025}
}

% FlowVQTalker (CVPR 2024). The braced system name keeps its capitals.
@inproceedings{talking-head-1,
  author    = {Tan, Shuai and Ji, Bin and Pan, Ye},
  title     = {{FlowVQTalker}: High-quality emotional talking face generation through normalizing flow and quantization},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {26317--26327},
  year      = {2024}
}

% Conversational gesture styles (CVPR 2019). The proceedings title is capitalised
% to match the other IEEE/CVF CVPR entries in this file.
@inproceedings{talking-head-2,
  author    = {Ginosar, Shiry and Bar, Amir and Kohavi, Gefen and Chan, Caroline and Owens, Andrew and Malik, Jitendra},
  title     = {Learning individual styles of conversational gesture},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {3497--3506},
  year      = {2019}
}

% EvalCrafter benchmark (CVPR 2024). The braced benchmark name keeps its capitals.
@inproceedings{evalcrafter,
  author    = {Liu, Yaofang and Cun, Xiaodong and Liu, Xuebo and Wang, Xintao and Zhang, Yong and Chen, Haoxin and Liu, Yang and Zeng, Tieyong and Chan, Raymond and Shan, Ying},
  title     = {{EvalCrafter}: Benchmarking and evaluating large video generation models},
  booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
  pages     = {22139--22149},
  year      = {2024}
}

% VBench++ preprint. The braced benchmark name keeps its capitals under sentence-casing styles.
@article{vbench++,
  author  = {Huang, Ziqi and Zhang, Fan and Xu, Xiaojie and He, Yinan and Yu, Jiashuo and Dong, Ziyue and Ma, Qianli and Chanpaisit, Nattapol and Si, Chenyang and Jiang, Yuming and others},
  title   = {{VBench++}: Comprehensive and versatile benchmark suite for video generative models},
  journal = {arXiv preprint arXiv:2411.13503},
  year    = {2024}
}
% Question answering dataset anchored in research papers (NAACL 2021).
@inproceedings{dasigi-etal,
  title     = {A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers},
  author    = {Dasigi, Pradeep and Lo, Kyle and Beltagy, Iz and Cohan, Arman and Smith, Noah A. and Gardner, Matt},
  editor    = {Toutanova, Kristina and Rumshisky, Anna and Zettlemoyer, Luke and Hakkani-Tur, Dilek and Beltagy, Iz and Bethard, Steven and Cotterell, Ryan and Chakraborty, Tanmoy and Zhou, Yichao},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  month     = jun,
  year      = {2021},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.naacl-main.365/},
  doi       = {10.18653/v1/2021.naacl-main.365},
  pages     = {4599--4610},
  abstract  = {Readers of academic research papers often read with the goal of answering specific questions. Question Answering systems that can answer those questions can make consumption of the content much more efficient. However, building such tools requires data that reflect the difficulty of the task arising from complex reasoning about claims made in multiple parts of a paper. In contrast, existing information-seeking question answering datasets usually contain questions about generic factoid-type information. We therefore present Qasper, a dataset of 5049 questions over 1585 Natural Language Processing papers. Each question is written by an NLP practitioner who read only the title and abstract of the corresponding paper, and the question seeks information present in the full text. The questions are then answered by a separate set of NLP practitioners who also provide supporting evidence to answers. We find that existing models that do well on other QA tasks do not perform well on answering these questions, underperforming humans by at least 27 F1 points when answering them from entire papers, motivating further research in document-grounded, information-seeking QA, which our dataset is designed to facilitate.}
}

% SciReplicate-Bench preprint. The benchmark name and the LLMs acronym are braced against down-casing.
@article{scireplicate,
  author  = {Xiang, Yanzheng and Yan, Hanqi and Ouyang, Shuyin and Gui, Lin and He, Yulan},
  title   = {{SciReplicate-Bench}: Benchmarking {LLMs} in agent-driven algorithmic reproduction from research papers},
  journal = {arXiv preprint arXiv:2504.00255},
  year    = {2025}
}

% ScienceAgentBench preprint. The braced benchmark name keeps its capitals under sentence-casing styles.
@article{scienceagentbench,
  author  = {Chen, Ziru and Chen, Shijie and Ning, Yuting and Zhang, Qianheng and Wang, Boshi and Yu, Botao and Li, Yifei and Liao, Zeyi and Wei, Chen and Lu, Zitong and others},
  title   = {{ScienceAgentBench}: Toward rigorous assessment of language agents for data-driven scientific discovery},
  journal = {arXiv preprint arXiv:2410.05080},
  year    = {2024}
}

% BixBench preprint. The benchmark name and the LLM acronym are braced against
% down-casing; middle initials carry their periods.
@article{bixbench,
  author  = {Mitchener, Ludovico and Laurent, Jon M. and Tenmann, Benjamin and Narayanan, Siddharth and Wellawatte, Geemi P. and White, Andrew and Sani, Lorenzo and Rodriques, Samuel G.},
  title   = {{BixBench}: a comprehensive benchmark for {LLM}-based agents in computational biology},
  journal = {arXiv preprint arXiv:2503.00096},
  year    = {2025}
}

% LLM-SRBench preprint. The benchmark name is braced against down-casing. The
% compound surname Barati Farimani is kept together; middle initials carry their periods.
@article{Llm-srbench,
  author  = {Shojaee, Parshin and Nguyen, Ngoc-Hieu and Meidani, Kazem and Barati Farimani, Amir and Doan, Khoa D. and Reddy, Chandan K.},
  title   = {{LLM-SRBench}: A new benchmark for scientific equation discovery with large language models},
  journal = {arXiv preprint arXiv:2504.10415},
  year    = {2025}
}

% SPIQA dataset (NeurIPS volume 37). The braced acronym keeps its capitals under sentence-casing styles.
@article{spiqa,
  author  = {Pramanick, Shraman and Chellappa, Rama and Venugopalan, Subhashini},
  title   = {{SPIQA}: A dataset for multimodal question answering on scientific papers},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {37},
  pages   = {118807--118833},
  year    = {2024}
}

% MS2 preprint. The braced dataset acronym keeps its capitals under sentence-casing styles.
@article{deyoung2021ms2,
  author  = {DeYoung, Jay and Beltagy, Iz and van Zuylen, Madeleine and Kuehl, Bailey and Wang, Lucy Lu},
  title   = {{MS2}: Multi-document summarization of medical studies},
  journal = {arXiv preprint arXiv:2104.06486},
  year    = {2021}
}

% Multi-XScience multi-document summarization dataset (EMNLP 2020).
@inproceedings{multi-xscience,
  title     = {Multi-{XS}cience: A Large-scale Dataset for Extreme Multi-document Summarization of Scientific Articles},
  author    = {Lu, Yao and Dong, Yue and Charlin, Laurent},
  editor    = {Webber, Bonnie and Cohn, Trevor and He, Yulan and Liu, Yang},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  month     = nov,
  year      = {2020},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2020.emnlp-main.648/},
  doi       = {10.18653/v1/2020.emnlp-main.648},
  pages     = {8068--8074},
  abstract  = {Multi-document summarization is a challenging task for which there exists little large-scale datasets. We propose Multi-XScience, a large-scale multi-document summarization dataset created from scientific articles. Multi-XScience introduces a challenging multi-document summarization task: writing the related-work section of a paper based on its abstract and the articles it references. Our work is inspired by extreme summarization, a dataset construction protocol that favours abstractive modeling approaches. Descriptive statistics and empirical results{---}using several state-of-the-art models trained on the Multi-XScience dataset{---}reveal that Multi-XScience is well suited for abstractive models.}
}

% Corpora for lay summarisation of scientific literature (arXiv preprint, 2022).
@article{goldsack2022making,
  author  = {Goldsack, Tomas and Zhang, Zhihao and Lin, Chenghua and Scarton, Carolina},
  title   = {Making science simple: Corpora for the lay summarisation of scientific literature},
  journal = {arXiv preprint arXiv:2210.09932},
  year    = {2022}
}

% IconShop (ACM Transactions on Graphics 42(6), 2023). The system name is braced
% against down-casing and the journal name is stored in full. The arrows in the
% abstract are written in LaTeX math so the entry stays ASCII-only.
@article{IconShop,
  author     = {Wu, Ronghuan and Su, Wanchao and Ma, Kede and Liao, Jing},
  title      = {{IconShop}: Text-Guided Vector Icon Synthesis with Autoregressive Transformers},
  journal    = {ACM Transactions on Graphics},
  year       = {2023},
  month      = dec,
  volume     = {42},
  number     = {6},
  articleno  = {230},
  numpages   = {14},
  issue_date = {December 2023},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0730-0301},
  doi        = {10.1145/3618364},
  url        = {https://doi.org/10.1145/3618364},
  keywords   = {SVG, autoregressive transformers, icon synthesis, text-guided generation, vector graphics generation},
  abstract   = {Scalable Vector Graphics (SVG) is a popular vector image format that offers good support for interactivity and animation. Despite its appealing characteristics, creating custom SVG content can be challenging for users due to the steep learning curve required to understand SVG grammars or get familiar with professional editing software. Recent advancements in text-to-image generation have inspired researchers to explore vector graphics synthesis using either image-based methods (i.e., text $\rightarrow$ raster image $\rightarrow$ vector graphics) combining text-to-image generation models with image vectorization, or language-based methods (i.e., text $\rightarrow$ vector graphics script) through pretrained large language models. Nevertheless, these methods suffer from limitations in terms of generation quality, diversity, and flexibility. In this paper, we introduce IconShop, a text-guided vector icon synthesis method using autoregressive transformers. The key to success of our approach is to sequentialize and tokenize SVG paths (and textual descriptions as guidance) into a uniquely decodable token sequence. With that, we are able to exploit the sequence learning power of autoregressive transformers, while enabling both unconditional and text-conditioned icon synthesis. Through standard training to predict the next token on a large-scale vector icon dataset accompanied by textural descriptions, the proposed IconShop consistently exhibits better icon synthesis capability than existing image-based and language-based methods both quantitatively (using the FID and CLIP scores) and qualitatively (through formal subjective user studies). Meanwhile, we observe a dramatic improvement in generation diversity, which is validated by the objective Uniqueness and Novelty measures. More importantly, we demonstrate the flexibility of IconShop with multiple novel icon synthesis tasks, including icon editing, icon interpolation, icon semantic combination, and icon design auto-suggestion.}
}

% Stable video diffusion (arXiv preprint, 2023).
@article{sd_video,
  author  = {Blattmann, Andreas and Dockhorn, Tim and Kulal, Sumith and Mendelevitch, Daniel and Kilian, Maciej and Lorenz, Dominik and Levi, Yam and English, Zion and Voleti, Vikram and Letts, Adam and others},
  title   = {Stable video diffusion: Scaling latent video diffusion models to large datasets},
  journal = {arXiv preprint arXiv:2311.15127},
  year    = {2023}
}

% Show-1 text-to-video generation (International Journal of Computer Vision, 2025).
@article{show_1,
  author    = {Zhang, David Junhao and Wu, Jay Zhangjie and Liu, Jia-Wei and Zhao, Rui and Ran, Lingmin and Gu, Yuchao and Gao, Difei and Shou, Mike Zheng},
  title     = {Show-1: Marrying pixel and latent diffusion models for text-to-video generation},
  journal   = {International Journal of Computer Vision},
  publisher = {Springer},
  volume    = {133},
  number    = {4},
  pages     = {1879--1893},
  year      = {2025}
}

% Paper2Agent preprint. The system name and the AI acronym are braced against down-casing.
@article{paper2agent,
  author  = {Miao, Jiacheng and Davis, Joe R and Pritchard, Jonathan K and Zou, James},
  title   = {{Paper2Agent}: Reimagining Research Papers As Interactive and Reliable {AI} Agents},
  journal = {arXiv preprint arXiv:2509.06917},
  year    = {2025}
}